STAT 206 Final Project¶

Dataset: Loan Status Prediction
https://www.kaggle.com/datasets/bhavikjikadara/loan-status-prediction/data

Team Members: Ankit Malhotra, Nathaniel Zhu

In [ ]:
versioninfo()
Julia Version 1.10.0
Commit 3120989f39b (2023-12-25 18:01 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: macOS (arm64-apple-darwin22.4.0)
  CPU: 12 × Apple M2 Pro
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1)
  Threads: 1 on 8 virtual cores
Environment:
  JULIA_NUM_THREADS = 
In [ ]:
import Pkg
# Activate the default shared environment (~/.julia/environments/v1.10)
# and print the installed-package manifest for reproducibility.
Pkg.activate()
Pkg.status()
  Activating project at `~/.julia/environments/v1.10`
Status `~/.julia/environments/v1.10/Project.toml`
  [cbdf2221] AlgebraOfGraphics v0.6.18
  [024491cd] BetaML v0.11.4
⌃ [336ed68f] CSV v0.10.12
  [13f3f980] CairoMakie v0.11.9
  [e2e10f9a] CatBoost v0.3.4
  [324d7699] CategoricalArrays v0.10.8
  [aaaa29a8] Clustering v0.15.7
  [8f4d0f93] Conda v1.10.0
  [a93c6f00] DataFrames v1.6.1
⌃ [1313f7d8] DataFramesMeta v0.14.1
  [b4f34e82] Distances v0.10.11
  [31c24e10] Distributions v0.25.107
⌃ [5789e2e9] FileIO v1.16.2
⌃ [587475ba] Flux v0.14.12
  [da1fdf0e] FreqTables v0.4.6
  [38e38edf] GLM v1.9.0
  [8d5ece8b] GLMNet v0.7.2
  [e9467ef8] GLMakie v0.9.9
  [09f84164] HypothesisTests v0.11.0
  [7073ff75] IJulia v1.24.2
  [4e3cecfd] ImageShow v0.3.8
  [f0e99cf1] MLBase v0.9.2
  [eb30cadb] MLDatasets v0.7.14
⌃ [add582a8] MLJ v0.20.2
  [d354fa79] MLJClusteringInterface v0.1.11
  [c6f25543] MLJDecisionTreeInterface v0.4.1
  [094fc8d1] MLJFlux v0.4.0
  [caf8df21] MLJGLMInterface v0.3.7
  [61c7150f] MLJLIBSVMInterface v0.2.1
  [6ee0df7b] MLJLinearModels v0.10.0
⌃ [d491faf4] MLJModels v0.16.15
  [1b6a4a23] MLJMultivariateStatsInterface v0.5.3
  [33e4bacb] MLJNaiveBayesInterface v0.1.6
  [5ae90465] MLJScikitLearnInterface v0.6.1
  [636a865e] NearestNeighborModels v0.2.3
  [8b842266] PalmerPenguins v0.1.4
⌃ [91a5bcdd] Plots v1.40.1
  [8162dcfd] PrettyPrint v0.2.0
  [ce6b1742] RDatasets v0.7.7
  [321657f4] ScientificTypes v3.0.2
  [8e980c4a] Shapefile v0.12.0
  [de6bee2f] SimpleChains v0.4.6
  [860ef19b] StableRNGs v1.0.1
  [2913bbd2] StatsBase v0.34.2
  [f3b207a7] StatsPlots v0.15.7
  [40c74d1a] TableView v0.7.2
  [fdbf4ff8] XLSX v0.10.1
  [37e2e46d] LinearAlgebra
  [9a3f8284] Random
Info Packages marked with ⌃ have new versions available and may be upgradable.

Packages¶

In [ ]:
# Loading Necessary Packages
# NOTE(review): the cell output shows `using CatBoost` fails with an InitError
# because the Python `catboost` module is missing — install it in the Python
# environment used by PythonCall (e.g. via Conda) or drop this import.
using CSV, DataFrames, Shapefile
using CategoricalArrays, FreqTables
using Plots, StatsPlots, Statistics
using LinearAlgebra, StatsBase, HypothesisTests
using Distributions, Random, StableRNGs
using PalmerPenguins, RDatasets
using MLJ, NearestNeighborModels, MLJScikitLearnInterface, MLJMultivariateStatsInterface
using MLJDecisionTreeInterface, MLJLinearModels
using CatBoost
InitError: Python: ModuleNotFoundError: No module named 'catboost'

Python stacktrace: none

during initialization of module CatBoost



Stacktrace:

  [1] pythrow()

    @ PythonCall ~/.julia/packages/PythonCall/wXfah/src/err.jl:94

  [2] errcheck

    @ PythonCall ~/.julia/packages/PythonCall/wXfah/src/err.jl:10 [inlined]

  [3] pyimport(m::String)

    @ PythonCall ~/.julia/packages/PythonCall/wXfah/src/concrete/import.jl:11

  [4] __init__()

    @ CatBoost ~/.julia/packages/CatBoost/TiqIz/src/CatBoost.jl:16

  [5] run_module_init(mod::Module, i::Int64)

    @ Base ./loading.jl:1128

  [6] register_restored_modules(sv::Core.SimpleVector, pkg::Base.PkgId, path::String)

    @ Base ./loading.jl:1116

  [7] _include_from_serialized(pkg::Base.PkgId, path::String, ocachepath::String, depmods::Vector{Any})

    @ Base ./loading.jl:1061

  [8] _require_search_from_serialized(pkg::Base.PkgId, sourcepath::String, build_id::UInt128)

    @ Base ./loading.jl:1575

  [9] _require(pkg::Base.PkgId, env::String)

    @ Base ./loading.jl:1932

 [10] __require_prelocked(uuidkey::Base.PkgId, env::String)

    @ Base ./loading.jl:1806

 [11] #invoke_in_world#3

    @ Base ./essentials.jl:921 [inlined]

 [12] invoke_in_world

    @ Base ./essentials.jl:918 [inlined]

 [13] _require_prelocked(uuidkey::Base.PkgId, env::String)

    @ Base ./loading.jl:1797

 [14] macro expansion

    @ Base ./loading.jl:1784 [inlined]

 [15] macro expansion

    @ Base ./lock.jl:267 [inlined]

 [16] __require(into::Module, mod::Symbol)

    @ Base ./loading.jl:1747

 [17] #invoke_in_world#3

    @ Base ./essentials.jl:921 [inlined]

 [18] invoke_in_world

    @ Base ./essentials.jl:918 [inlined]

 [19] require(into::Module, mod::Symbol)

    @ Base ./loading.jl:1740

 [20] eval

    @ ./boot.jl:385 [inlined]

 [21] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)

    @ Base ./loading.jl:2070

 [22] #invokelatest#2

    @ ./essentials.jl:887 [inlined]

 [23] invokelatest

    @ ./essentials.jl:884 [inlined]

 [24] (::VSCodeServer.var"#214#215"{VSCodeServer.NotebookRunCellArguments, String})()

    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:19

 [25] withpath(f::VSCodeServer.var"#214#215"{VSCodeServer.NotebookRunCellArguments, String}, path::String)

    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/repl.jl:274

 [26] notebook_runcell_request(conn::VSCodeServer.JSONRPC.JSONRPCEndpoint{Base.PipeEndpoint, Base.PipeEndpoint}, params::VSCodeServer.NotebookRunCellArguments)

    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:13

 [27] dispatch_msg(x::VSCodeServer.JSONRPC.JSONRPCEndpoint{Base.PipeEndpoint, Base.PipeEndpoint}, dispatcher::VSCodeServer.JSONRPC.MsgDispatcher, msg::Dict{String, Any})

    @ VSCodeServer.JSONRPC ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/JSONRPC/src/typed.jl:67

 [28] serve_notebook(pipename::String, outputchannel_logger::Base.CoreLogging.SimpleLogger; crashreporting_pipename::String)

    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:139

 [29] top-level scope

    @ ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/notebook/notebook.jl:35

About this dataset and our eventual goal:

  1. This Loan Status Prediction dataset contains records of applicants who previously applied for a property loan.
  2. Decide whether to give a loan to the applicant based on some factors such as Applicant Income, Loan Amount, previous Credit History, Co-applicant Income, etc…
  3. Our goal is to build a Machine Learning Model to predict the loan to be approved or to be rejected for an applicant.

Main Coding¶

Part 1: Data Importing and Cleaning¶

In [ ]:
# Load the loan dataset into a DataFrame.
# (CSV.File piped into the DataFrame constructor is equivalent to
# CSV.read(path, DataFrame).)
loan_data = DataFrame(CSV.File("Data/loan_data.csv"))

# Peek at the first ten rows to sanity-check the import.
first(loan_data, 10)
10×13 DataFrame
RowLoan_IDGenderMarriedDependentsEducationSelf_EmployedApplicantIncomeCoapplicantIncomeLoanAmountLoan_Amount_TermCredit_HistoryProperty_AreaLoan_Status
String15String7?String3String3?String15String3?Int64Float64Float64Float64?Float64?String15String1
1LP001003MaleYes1GraduateNo45831508.0128.0360.01.0RuralN
2LP001005MaleYes0GraduateYes30000.066.0360.01.0UrbanY
3LP001006MaleYes0Not GraduateNo25832358.0120.0360.01.0UrbanY
4LP001008MaleNo0GraduateNo60000.0141.0360.01.0UrbanY
5LP001013MaleYes0Not GraduateNo23331516.095.0360.01.0UrbanY
6LP001024MaleYes2GraduateNo3200700.070.0360.01.0UrbanY
7LP001027MaleYes2Graduatemissing25001840.0109.0360.01.0UrbanY
8LP001029MaleNo0GraduateNo18532840.0114.0360.01.0RuralN
9LP001030MaleYes2GraduateNo12991086.017.0120.01.0UrbanY
10LP001032MaleNo0GraduateNo49500.0125.0360.01.0UrbanY
In [ ]:
names(loan_data)
13-element Vector{String}:
 "Loan_ID"
 "Gender"
 "Married"
 "Dependents"
 "Education"
 "Self_Employed"
 "ApplicantIncome"
 "CoapplicantIncome"
 "LoanAmount"
 "Loan_Amount_Term"
 "Credit_History"
 "Property_Area"
 "Loan_Status"
In [ ]:
# Get an overview of the Data
# `describe` reports mean/min/median/max, missing counts, and eltype per column.
describe(loan_data)
13×7 DataFrame
Rowvariablemeanminmedianmaxnmissingeltype
SymbolUnion…AnyUnion…AnyInt64Type
1Loan_IDLP001003LP0029900String15
2GenderFemaleMale5Union{Missing, String7}
3MarriedNoYes0String3
4Dependents03+8Union{Missing, String3}
5EducationGraduateNot Graduate0String15
6Self_EmployedNoYes21Union{Missing, String3}
7ApplicantIncome3579.851503333.097030Int64
8CoapplicantIncome1277.280.0983.033837.00Float64
9LoanAmount104.9879.0110.0150.00Float64
10Loan_Amount_Term340.86512.0360.0480.011Union{Missing, Float64}
11Credit_History0.8376070.01.01.030Union{Missing, Float64}
12Property_AreaRuralUrban0String15
13Loan_StatusNY0String1
In [ ]:
# Check the size of the DataFrame
# Returns (nrows, ncols).
size(loan_data)
(381, 13)

Couple of points to note here :

  1. We have missing values in dataset in the first look, so that needs to be taken care of.
  2. There is a lot of categorical data : Gender, Married, Education level, self-employed and property area. We need to convert this into numerical, so that we can use them for EDA and regression / classification as well.
  3. Our target variable is 'Loan_Status', a dichotomous variable with two categories, "Y" and "N". Our task is therefore to predict, from the applicant data available to us, whether a loan application will be approved.
  4. The Dependents variable is dirty: it contains the special value "3+", which must be handled before the column can be treated as numeric.

Ways of dealing with missing data :

    We could impute the missing entries (for example, with randomized averages) wherever possible.

    However, imputation would compromise data integrity, which is undesirable given that the dataset is not large. Instead, we decided to simply drop the rows with missing values.

In [ ]:
# Remove rows with any missing values 
# Note: `dropmissing` also narrows the column eltypes (Union{Missing,T} -> T),
# which later cells rely on when parsing/encoding columns.
clean_data = dropmissing(loan_data)

# Check size again after cleaning
size(clean_data)
(308, 13)
In [ ]:
# Confirm the cleaning step: every column should now report zero missings.
for colname in names(clean_data)
    n_missing = count(ismissing, clean_data[!, colname])
    println("Column: $colname has $n_missing missing values")
end
Column: Loan_ID has 0 missing values
Column: Gender has 0 missing values
Column: Married has 0 missing values
Column: Dependents has 0 missing values
Column: Education has 0 missing values
Column: Self_Employed has 0 missing values
Column: ApplicantIncome has 0 missing values
Column: CoapplicantIncome has 0 missing values
Column: LoanAmount has 0 missing values
Column: Loan_Amount_Term has 0 missing values
Column: Credit_History has 0 missing values
Column: Property_Area has 0 missing values
Column: Loan_Status has 0 missing values

After cleaning the data, it also appears that some columns are typed as String; let us convert these columns to categorical/numeric types for better analysis in Julia.

In [ ]:
# Work on a copy so the string-typed `clean_data` stays available untouched.
real_data = copy(clean_data)

# Binary indicator encoding: 1 when the column equals `positive`, else 0.
indicator(col, positive) = Int.(real_data[!, col] .== positive)

real_data[!, :Loan_Status]   = indicator(:Loan_Status, "Y")
real_data[!, :Gender]        = indicator(:Gender, "Female")
real_data[!, :Married]       = indicator(:Married, "Yes")
real_data[!, :Education]     = indicator(:Education, "Graduate")
real_data[!, :Self_Employed] = indicator(:Self_Employed, "Yes")

# Ordinal encoding of Property_Area: Urban -> 2, Semiurban -> 1, otherwise 0.
area_code(a) = a == "Urban" ? 2 : (a == "Semiurban" ? 1 : 0)
real_data[!, :Property_Area] = area_code.(real_data[!, :Property_Area])

# "3+" stands for three or more dependents; collapse it to "3" and parse.
real_data[!, :Dependents] =
    [parse(Int, replace(d, "3+" => "3")) for d in real_data[!, :Dependents]]

describe(real_data)
13×7 DataFrame
Rowvariablemeanminmedianmaxnmissingeltype
SymbolUnion…AnyUnion…AnyInt64DataType
1Loan_IDLP001003LP0029900String15
2Gender0.20454500.010Int64
3Married0.60064901.010Int64
4Dependents0.67857100.030Int64
5Education0.74350601.010Int64
6Self_Employed0.090909100.010Int64
7ApplicantIncome3599.131503329.597030Int64
8CoapplicantIncome1278.430.0871.533837.00Float64
9LoanAmount104.6239.0110.0150.00Float64
10Loan_Amount_Term341.18236.0360.0480.00Float64
11Credit_History0.8538960.01.01.00Float64
12Property_Area1.0422101.020Int64
13Loan_Status0.71103901.010Int64

Part 2: Data Visualization¶

Now it is time for visualization. Below are graphs covering all of the variables. Please feel free to look through our basic data visualization.

In [ ]:
# Count observations per Loan_Status category ("Y" / "N").
loan_status_counts = combine(groupby(clean_data, :Loan_Status), nrow => :count)

# Bar chart showing the class balance of the target variable.
bar_plot = bar(
    loan_status_counts.Loan_Status,
    loan_status_counts.count;
    xlabel = "Loan Status",
    ylabel = "Count",
    title = "Distribution of Loan Status",
    legend = false,
)

# Label each bar with its exact count.
for (status, n) in zip(loan_status_counts.Loan_Status, loan_status_counts.count)
    annotate!(bar_plot, [(status, n, text(string(n), 8, :center, :bottom))])
end

# Display the finished plot.
bar_plot
In [ ]:
# 2. Applicant Income By Loan Status
# Box plot of income split by the encoded approval outcome (0 = N, 1 = Y).
boxplot(
    real_data[!, :Loan_Status],
    real_data[!, :ApplicantIncome];
    title = "Applicant Income by Loan Status",
    ylabel = "Applicant Income",
    legend = false,
)
In [ ]:
# 3. Gender Histogram
# Gender is 0/1-coded (1 = Female per the earlier recoding), hence two bins.
histogram(real_data[!, :Gender], bins = 2, title = "Gender Distribution", legend = false)
In [ ]:
# 4. Married Histogram
# Married is 0/1-coded (1 = Yes), hence two bins.
histogram(real_data[!, :Married], bins = 2, title = "Married Status Distribution", legend = false)
In [ ]:
# 5. Dependent Histogram
# Dependents takes integer values 0-3 ("3+" was collapsed to 3 earlier).
histogram(real_data[!, :Dependents], title = "Dependents Distribution", legend = false)
In [ ]:
# 6. Education Histogram
# Education is 0/1-coded (1 = Graduate), hence two bins.
histogram(real_data[!, :Education], bins = 2, title = "Education Distribution", legend = false)
In [ ]:
# 7. Self_Employed Histogram
# Self_Employed is 0/1-coded (1 = Yes), hence two bins.
histogram(real_data[!, :Self_Employed], bins = 2, title = "Self Employed Distribution", legend = false)
In [ ]:
# 8. ApplicantIncome Histogram
# Continuous variable; default binning shows the income distribution shape.
histogram(real_data[!, :ApplicantIncome], title = "Applicant Income Distribution", xlabel = "Income", ylabel = "Frequency", legend = false)
In [ ]:
# 9. CoApplicantIncome Histogram
# Continuous variable; many applicants have a co-applicant income of zero.
histogram(real_data[!, :CoapplicantIncome], title = "Co-Applicant Income Distribution", xlabel = "Income", ylabel = "Frequency", legend = false)
In [ ]:
# 10. LoanAmount Histogram
# Continuous variable; plot object kept in a named binding.
loan_amount_hist = histogram(real_data[!, :LoanAmount], title = "Loan Amount Distribution", xlabel = "Amount", ylabel = "Frequency", legend = false)
In [ ]:
# 11. Loan_Amount_Term Histogram
# Term in months; dominated by the standard 360-month (30-year) term.
loan_amount_term_hist = histogram(real_data[!, :Loan_Amount_Term], title = "Loan Amount Term Distribution", xlabel = "Term", ylabel = "Frequency", legend = false)
In [ ]:
# 12. Credit History Histogram
# Credit_History is already 0.0/1.0, hence two bins.
credit_history_hist = histogram(real_data[!, :Credit_History], bins = 2, title = "Credit History Distribution", legend = false)
In [ ]:
# 13. Property Area Histogram
# Property_Area was encoded as Rural = 0, Semiurban = 1, Urban = 2.
property_area_hist = histogram(real_data[!, :Property_Area], title = "Property Area Distribution", xlabel = "Area", ylabel = "Frequency", legend = false)
In [ ]:
# 14. Property_Area vs Loan Amount Dot plot
# One point per applicant, grouped/colored by encoded property area (0/1/2).
areas = real_data[!, :Property_Area]
amounts = real_data[!, :LoanAmount]

dotplot(
    areas,
    amounts;
    title = "Property Area and Loan Amount Dot Plot",
    xlabel = "Property Area",
    ylabel = "Loan Amount",
    group = areas,
)
In [ ]:
# 15. Property_Area vs Loan_Amount_Term
# Violin plot of loan term distributions per encoded property area (0/1/2).
areas = real_data[!, :Property_Area]
terms = real_data[!, :Loan_Amount_Term]

violin(
    areas,
    terms;
    title = "Property and Loan Amount Term Violin Plot",
    xlabel = "Property Area",
    ylabel = "Loan_Amount_Term",
    group = areas,
)
In [ ]:
# 16. Correlation Plot Analysis part 1
# Pairwise correlation grid over the categorical/discrete predictors
# plus the target. `cols(...)` lets @df select columns from a Symbol vector.
selected_cols = [:Gender, :Married, :Education, :Dependents,
                 :Self_Employed, :Property_Area, :Loan_Status]

@df real_data corrplot(cols(selected_cols),
    size = (1000, 1000),
    bins = 32,
    title = "Correlation Plot",
    xlabel = "Variable",
    ylabel = "Variable",
    labelfontsize = 8,
    tickfontsize = 5,
    linewidth = 2,
    markersize = 10,
    clim = (-1, 1),
    colorbar_title = "Correlation Coefficient",
)
In [ ]:
# 17. Correlation Plot Analysis part 2
# Pairwise correlation grid over the continuous predictors plus the target.
selected_cols = [:ApplicantIncome, :LoanAmount, :Loan_Amount_Term,
                 :Credit_History, :Loan_Status]

@df real_data corrplot(cols(selected_cols),
    size = (1000, 1000),
    bins = 32,
    title = "Correlation Plot",
    xlabel = "Variable",
    ylabel = "Variable",
    labelfontsize = 8,
    tickfontsize = 5,
    linewidth = 2,
    markersize = 10,
    clim = (-1, 1),
    colorbar_title = "Correlation Coefficient",
)
In [ ]:
# 18. Correlation Matrix for All Variables
# Pairwise Pearson correlations over the 12 numeric columns (Loan_ID excluded).
M = cor(Matrix(real_data[!, 2:13]))

vars = names(real_data)[2:13]

fig = heatmap(M,
  title = "Correlation Matrix",
  clims = (-1, 1),
  # Fix: M is a 12×12 matrix indexed 1:12, so tick positions must be 1:12;
  # the original (2:13, vars) shifted every axis label by one cell.
  xticks = (1:12, vars),
  yticks = (1:12, vars),
  color = cgrad(:balance, rev = true),
  xrot = 45,
  # Fix: Plots.jl spells this keyword `aspect_ratio`; the original
  # `aspect = :ratio` was not a recognized attribute.
  aspect_ratio = :equal,
  size = (700, 600),
)

# Overlay each cell with its rounded coefficient.
for j in axes(M, 2), i in axes(M, 1)
  annotate!(i, j, text("$(round(M[i,j], digits = 2))", :white, 12))
end

fig

Data Modeling Starts Here¶

In [ ]:
select!(real_data, Not(:Loan_ID))
308×12 DataFrame
283 rows omitted
RowGenderMarriedDependentsEducationSelf_EmployedApplicantIncomeCoapplicantIncomeLoanAmountLoan_Amount_TermCredit_HistoryProperty_AreaLoan_Status
Int64Int64Int64Int64Int64Int64Float64Float64Float64Float64Int64Int64
10111045831508.0128.0360.01.000
20101130000.066.0360.01.021
30100025832358.0120.0360.01.021
40001060000.0141.0360.01.021
50100023331516.095.0360.01.021
6012103200700.070.0360.01.021
70001018532840.0114.0360.01.000
80121012991086.017.0120.01.021
90001049500.0125.0360.01.021
101001035100.076.0360.00.020
110100048870.0133.0360.01.000
120100076600.0104.0360.00.020
130100026001911.0116.0360.00.010
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2970111027871917.0146.0360.00.000
2980101022971522.0104.0360.01.021
2991000021650.070.0360.01.011
3000121127260.0106.0360.00.010
3010101030003416.056.0180.01.011
3020101038593300.0142.0180.01.001
3030000038330.0110.0360.01.001
3040131057030.0128.0360.01.021
3050101032321950.0108.0360.01.001
3061001029000.071.0360.01.001
3070131041060.040.0180.01.001
3081001145830.0133.0360.00.010
In [ ]:
# NOTE(review): per the schema output below, every column is already numeric
# (Count/Continuous scitypes), so OneHotEncoder finds no Multiclass features
# and this transform appears to be a no-op — confirm this is intended, or
# coerce the categorical columns to Multiclass before encoding.
hot = MLJ.fit!(machine(OneHotEncoder(), real_data))

# apply the dummy coding scheme; note that we qualify `transform`
data_hot_encoded = MLJ.transform(hot, real_data)

# check
schema(data_hot_encoded)
┌ Info: Training machine(OneHotEncoder(features = Symbol[], …), …).
└ @ MLJBase /Users/nathanielzhu/.julia/packages/MLJBase/mIaqI/src/machines.jl:493
┌───────────────────┬────────────┬─────────┐
│ names             │ scitypes   │ types   │
├───────────────────┼────────────┼─────────┤
│ Gender            │ Count      │ Int64   │
│ Married           │ Count      │ Int64   │
│ Dependents        │ Count      │ Int64   │
│ Education         │ Count      │ Int64   │
│ Self_Employed     │ Count      │ Int64   │
│ ApplicantIncome   │ Count      │ Int64   │
│ CoapplicantIncome │ Continuous │ Float64 │
│ LoanAmount        │ Continuous │ Float64 │
│ Loan_Amount_Term  │ Continuous │ Float64 │
│ Credit_History    │ Continuous │ Float64 │
│ Property_Area     │ Count      │ Int64   │
│ Loan_Status       │ Count      │ Int64   │
└───────────────────┴────────────┴─────────┘
In [ ]:
rng = StableRNG(1997)

# 70/30 row split of the encoded table (seeded for reproducibility).
# NOTE(review): `train`/`test` are rebound to index vectors further down via
# partition(eachindex(y), ...), so this DataFrame split is never used — verify
# whether this cell can be removed.
train, test = partition(data_hot_encoded, 0.7; rng = rng, shuffle = true)
(216×12 DataFrame
 Row │ Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  ⋯
     │ Int64   Int64    Int64       Int64      Int64          Int64            ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │      0        0           0          0              0             2333  ⋯
   2 │      1        1           1          1              0             4608
   3 │      0        0           0          1              1             7167
   4 │      1        0           0          1              0             3159
   5 │      1        0           1          0              0             4606  ⋯
   6 │      0        1           0          1              0             3232
   7 │      0        1           2          1              0             5935
   8 │      1        0           0          0              0             3400
  ⋮  │   ⋮        ⋮         ⋮           ⋮            ⋮               ⋮         ⋱
 210 │      0        1           3          1              0             3400  ⋯
 211 │      0        1           0          0              0             3814
 212 │      0        1           2          1              0             4400
 213 │      0        1           0          1              0             5488
 214 │      1        0           0          1              0             1811  ⋯
 215 │      0        1           1          0              0             3500
 216 │      1        0           0          1              0             5000
                                                  6 columns and 201 rows omitted, 92×12 DataFrame
 Row │ Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  ⋯
     │ Int64   Int64    Int64       Int64      Int64          Int64            ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │      0        0           0          1              0             6277  ⋯
   2 │      0        1           3          0              1             7100
   3 │      0        0           0          1              0             2500
   4 │      0        1           1          1              0             2750
   5 │      1        1           0          1              0             2484  ⋯
   6 │      0        1           0          1              0             3246
   7 │      0        0           0          1              0             6000
   8 │      0        1           0          1              0             2958
  ⋮  │   ⋮        ⋮         ⋮           ⋮            ⋮               ⋮         ⋱
  86 │      1        0           0          1              0             3762  ⋯
  87 │      0        1           0          1              0             4860
  88 │      0        1           0          1              0             3597
  89 │      1        0           1          1              1             8624
  90 │      0        1           1          1              0             2882  ⋯
  91 │      1        1           0          0              1             7142
  92 │      0        1           3          1              0             8750
                                                   6 columns and 77 rows omitted)
In [ ]:
# Target as a categorical vector with explicit levels (0 = rejected,
# 1 = approved), and the remaining columns as the feature table.
y = categorical(real_data[!, :Loan_Status]; levels = [0, 1])
X = select(real_data, Not(:Loan_Status))

# Confirm the declared level order.
levels(y)
2-element Vector{Int64}:
 0
 1
In [ ]:
train, test = partition(eachindex(y), 0.8, shuffle=true, rng=1997);
In [ ]:
# checking best models for our work
# List every registered MLJ model whose declared input/target scitypes
# are compatible with (X, y).
for candidate in models(matching(X, y))
    println("""
    [Model: $(candidate.name)]
    \t prediction type: $(candidate.prediction_type)
    \t source package:  $(candidate.package_name)
    """)
end
[Model: AdaBoostStumpClassifier]
	 prediction type: probabilistic
	 source package:  DecisionTree

[Model: CatBoostClassifier]
	 prediction type: probabilistic
	 source package:  CatBoost

[Model: ConstantClassifier]
	 prediction type: probabilistic
	 source package:  MLJModels

[Model: DecisionTreeClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: DecisionTreeClassifier]
	 prediction type: probabilistic
	 source package:  DecisionTree

[Model: DeterministicConstantClassifier]
	 prediction type: deterministic
	 source package:  MLJModels

[Model: EvoTreeClassifier]
	 prediction type: probabilistic
	 source package:  EvoTrees

[Model: KernelPerceptronClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: NeuralNetworkClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: PegasosClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: PerceptronClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: RandomForestClassifier]
	 prediction type: probabilistic
	 source package:  BetaML

[Model: RandomForestClassifier]
	 prediction type: probabilistic
	 source package:  DecisionTree

[Model: RandomForestClassifier]
	 prediction type: probabilistic
	 source package:  MLJScikitLearnInterface

[Model: StableForestClassifier]
	 prediction type: probabilistic
	 source package:  SIRUS

[Model: StableRulesClassifier]
	 prediction type: probabilistic
	 source package:  SIRUS

Here are all the models that would work in this project:

[Model: AdaBoostStumpClassifier] prediction type: probabilistic source package: DecisionTree

[Model: CatBoostClassifier] prediction type: probabilistic source package: CatBoost

[Model: ConstantClassifier] prediction type: probabilistic source package: MLJModels

[Model: DecisionTreeClassifier] prediction type: probabilistic source package: BetaML

[Model: DecisionTreeClassifier] prediction type: probabilistic source package: DecisionTree

[Model: DeterministicConstantClassifier] prediction type: deterministic source package: MLJModels

[Model: EvoTreeClassifier] prediction type: probabilistic source package: EvoTrees

[Model: KernelPerceptronClassifier] prediction type: probabilistic source package: BetaML

[Model: NeuralNetworkClassifier] prediction type: probabilistic source package: BetaML

[Model: PegasosClassifier] prediction type: probabilistic source package: BetaML

[Model: PerceptronClassifier] prediction type: probabilistic source package: BetaML

[Model: RandomForestClassifier] prediction type: probabilistic source package: BetaML

[Model: RandomForestClassifier] prediction type: probabilistic source package: DecisionTree

[Model: RandomForestClassifier] prediction type: probabilistic source package: MLJScikitLearnInterface

[Model: StableForestClassifier] prediction type: probabilistic source package: SIRUS

[Model: StableRulesClassifier] prediction type: probabilistic source package: SIRUS

In [ ]:
# Load the model types from their providing packages once.
# verbosity = 0 silences the @load progress banners.
KNNClassifier = @load KNNClassifier verbosity = 0
LDA = @load LDA verbosity = 0
NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg = MLJFlux verbosity = 0
MultinomialClassifier = @load MultinomialClassifier verbosity = 0
# Fix: the original passed `verbosity = 0` to this @load call twice.
CatBoostClassifier = @load CatBoostClassifier pkg = CatBoost verbosity = 0
RandomForestClassifier = @load RandomForestClassifier pkg = BetaML verbosity = 0
DecisionTreeClassifier = @load DecisionTreeClassifier pkg = DecisionTree verbosity = 0

# Candidate classifiers, all evaluated below under the same pipeline.
model_list = [
    KNNClassifier(K = 5), # use nearest 5-neighbors to make predictions
    LDA(),
    NeuralNetworkClassifier(),
    MultinomialClassifier(),
    CatBoostClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier()
]
7-element Vector{Probabilistic}:
 KNNClassifier(K = 5, …)
 LDA(method = gevd, …)
 NeuralNetworkClassifier(builder = Short(n_hidden = 0, …), …)
 MultinomialClassifier(lambda = 2.220446049250313e-16, …)
 CatBoostClassifier(iterations = 1000, …)
 RandomForestClassifier(n_trees = 30, …)
 DecisionTreeClassifier(max_depth = -1, …)
In [ ]:
# Per-model metric accumulators; one entry per classifier is pushed by
# the evaluation loop below, in model_list order.
acc = Float64[] # accuracy()
pre = Float64[] # multiclass_precision()
rec = Float64[] # multiclass_recall()
f1s = Float64[] # f1score()
mat = []        # confusion_matrix()
Any[]
In [ ]:
for clf in model_list
    # Create a pipeline model that standardizes, then fits a classifier.
    model = Pipeline(Standardizer(), clf)

    # Fit the model onto the training rows only.
    mach = machine(model, X, y)
    MLJ.fit!(mach, rows = train, verbosity = 0)

    # Probabilistic predictions on the held-out rows.
    yhat = MLJ.predict(mach, rows = test)

    # Collapse each predicted distribution to its most likely class ONCE;
    # the original recomputed mode.(yhat) for every one of the five metrics.
    yhat_mode = mode.(yhat)
    ytest = y[test]

    # Evaluation metrics follow the F(fitted, observed) convention.
    push!(acc, accuracy(yhat_mode, ytest))
    push!(pre, multiclass_precision(yhat_mode, ytest))
    push!(rec, multiclass_recall(yhat_mode, ytest))
    push!(f1s, f1score(yhat_mode, ytest))
    push!(mat, ConfusionMatrix(levels = levels(y))(yhat_mode, ytest))
end
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1. 
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1. 
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
┌ Warning: Layer with Float32 parameters got Float64 input.
│   The input will be converted, but any earlier layers may be very slow.
│   layer = Dense(11 => 5, σ)
│   summary(x) = 11×1 Matrix{Float64}
└ @ Flux /Users/nathanielzhu/.julia/packages/Flux/vzwqj/src/layers/stateless.jl:60
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1. 
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
In [ ]:
# Summary table: one row per classifier with its held-out metrics.
# NOTE(review): the saved output shows `model_list` undefined here — this is a
# stale-session artifact; re-run the earlier cells in order before this one.
results = DataFrame(
    Model = typeof.(model_list),
    Accuracy = acc,
    Precision = pre,
    Recall = rec,
    F1 = f1s
)
UndefVarError: `model_list` not defined



Stacktrace:

 [1] top-level scope

   @ ~/Desktop/UCR/STAT206/Coding/Project/Project_Code.ipynb:1
In [ ]:
@show mat[2]
In [ ]:
glm_df = DataFrame(copy(real_data))

# GLM treats CategoricalArray columns as factors (dummy-coded contrasts),
# so convert the discrete predictors and the target before fitting.
glm_df[!, :Credit_History] = CategoricalArray(glm_df[!, :Credit_History])
glm_df[!, :Property_Area] = CategoricalArray(glm_df[!, :Property_Area])
glm_df[!, :Loan_Status] = CategoricalArray(glm_df[!, :Loan_Status])
glm_df[!, :Education] = CategoricalArray(glm_df[!, :Education])
glm_df[!, :Gender] = CategoricalArray(glm_df[!, :Gender])
glm_df[!, :Self_Employed] = CategoricalArray(glm_df[!, :Self_Employed])

# Define the formula for the regression once and reuse it below
# (the original defined `formula` but then rebuilt the same formula inline).
formula = @formula(Loan_Status ~ ApplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History + Property_Area)

# Fit a binomial GLM with a probit link (probit regression).
probit = glm(formula, glm_df, Binomial(), ProbitLink())

# Print the model summary.
# Fix: the original called `println(model)`, but `model` was never defined
# (the lm(...) line creating it was commented out) — print `probit` instead.
println(probit)